import docx2txt
import textract

from abc import ABC
from typing import List
from langchain.docstore.document import Document
from langchain.document_loaders import Docx2txtLoader as sys_Docx2txtLoader


def doc2txt(path: str) -> str:
    """Extract the plain text of a Word document.

    Args:
        path: File-system path of the document. A path whose extension is
            ``.doc`` (compared case-insensitively) is handed to
            ``textract``; any other path is handed to ``docx2txt``.

    Returns:
        The extracted text.
    """
    # Compare the extension case-insensitively so that names such as
    # "REPORT.DOC" are not mistakenly sent to docx2txt.
    if path.lower().endswith(".doc"):
        # Original author's note: textract flattens a document table into a
        # single line; add special handling here if table layout matters.
        return textract.process(path).decode()
    return docx2txt.process(path)


class Docx2txtLoader(sys_Docx2txtLoader, ABC):
    """Variant of the LangChain ``Docx2txtLoader`` whose ``load`` extracts
    text through the module-level ``doc2txt`` helper."""

    def load(self) -> List[Document]:
        """Return the file at ``self.file_path`` as a one-element list.

        The single ``Document`` holds the text produced by ``doc2txt`` and
        records the file path under the ``"source"`` metadata key.
        """
        source = self.file_path
        text = doc2txt(source)
        document = Document(page_content=text, metadata={"source": source})
        return [document]
